### This R script is to run models for the average change in teacher variables from 2019 to 2023 on 
### school/district characteristics using a hierarchal linear model. Instructions:

# Regress the average change from 2019-2023 for school i in district j onto school characteristics and 
# district characteristics. 
# -	School characteristics: IPR(130), % Black
# -	District characteristics: mode of instruction, median household income

# - Run regressions with each characteristic alone, then with all four together
# - Weight the regressions by number of teachers. 

### open necessary packages
library(tidyverse)
library(lme4)
library(lmerTest)
library(MuMIn)

# set your directory:
setwd("/Users/olgabaker/Desktop/Replication Package Summer 2025/Data")
data_19 <- read.csv("core_19.csv")
data_23 <- read.csv("core_23.csv")

### divide median household income by $1000 (so the income variable is in thousands of dollars). 
data_19$median_income <- data_19$median_income/1000

## Make names of teacher variables consistent across years. Rename variables if their variable name changed in a given 
# year. Change it to match the name in the "variable name" column of "Teacher Variables.xlsx"
data_19 <- data_19 %>% rename(cdis1=cdis)

## import list of teacher variables
varnames <- read.csv("Teacher Variables.csv", na.strings = c(""))
varnames <- varnames$Variable.name
varnames <- varnames[!is.na(varnames)] 

### 1) Get the average change from 2019-2023 for each school by taking the difference 
# for every teacher variable and averaging over these values. 
# Drop schools that are not present in both years or have a row of missing values 
# when you take differences. Also drop the variable "data" because it isn't available in 2023.
teacher_vars_19 <- select(data_19, ncessch, any_of(varnames))
teacher_vars_23 <- select(data_23, ncessch, any_of(varnames))
teacher_vars_19 <- select(teacher_vars_19, -data)

teacher_vars_19 <- filter(teacher_vars_19, ncessch %in% teacher_vars_23$ncessch)
teacher_vars_23 <- filter(teacher_vars_23, ncessch %in% teacher_vars_19$ncessch)

# check that column names and rows are in the right positions:
colnames(teacher_vars_19)==colnames(teacher_vars_23)
teacher_vars_19 <- arrange(teacher_vars_19, ncessch)
teacher_vars_23 <- arrange(teacher_vars_23, ncessch)
sum(teacher_vars_19$ncessch==teacher_vars_23$ncessch)==nrow(teacher_vars_19)

difference_19_23 <- cbind(ncessch=teacher_vars_19$ncessch,
      as.matrix(select(teacher_vars_23,-ncessch)) - 
        as.matrix(select(teacher_vars_19,-ncessch)))

# check work
head(difference_19_23,3)
head(teacher_vars_23,3)
head(teacher_vars_19,3)

# average over the differences for each school
difference_19_23 <- as.data.frame(difference_19_23)
difference_19_23$avg_change <- rowMeans(select(difference_19_23, -ncessch), na.rm = T)
sum(is.na(difference_19_23$avg_change)) # 531 schools have no average change - drop these
difference_19_23 <- filter(difference_19_23, !is.na(avg_change))
nrow(difference_19_23) # ended up with 3251 schools

# merge in the difference into 2019 data
data_19 <- left_join(data_19, select(difference_19_23, ncessch, avg_change))
summary(data_19$avg_change)

# drop schools missing an average change value - there are 613 of them
data_19 <- filter(data_19, !is.na(avg_change))

# 2) Run univariate regressions
# IPR(130)
model_ipr_130 <- lmer(avg_change~1+ipr_130+(1|leaid),
                      data=data_19, na.action=na.omit, weights=teacher_n,REML=FALSE)
summary(model_ipr_130)

# Black
model_Black <- lmer(avg_change~1+Black+(1|leaid),
                      data=data_19, na.action=na.omit, weights=teacher_n,REML=FALSE)
summary(model_Black)

# median_income
model_median_income <- lmer(avg_change~1+median_income+(1|leaid),
                    data=data_19, na.action=na.omit, weights=teacher_n,REML=FALSE)
summary(model_median_income)

# mode_of_instruction
model_mode_of_instruction <- lmer(avg_change~1+share_virtual+share_hybrid+(1|leaid),
                    data=data_19, na.action=na.omit, weights=teacher_n,REML=FALSE)
summary(model_mode_of_instruction)

# 3) Run model with all variables 
model_all_vars <- lmer(avg_change~1+ipr_130+Black+median_income+share_virtual+share_hybrid+(1|leaid),
               data=data_19, na.action=na.omit, weights=teacher_n,REML=FALSE)
summary(model_all_vars)
r.squaredGLMM(model_all_vars)

# 4) Make table 
# Coefficients with standard errors underneath, and a star to indicate 
# statistical significance at the 5 percent level (to match the other tables)
#
# a) get tables of coefficients, p-values and s.e.'s
# b) get table of N and R^2
# c) Combine

# a) 
## coefficients 
Reg_coefficients <- matrix(NA,5,5)
Reg_coefficients <- as.data.frame(Reg_coefficients)
rownames(Reg_coefficients) <- c("ipr_130","Black","median_income","share_virtual","share_hybrid")
colnames(Reg_coefficients) <- c("reg_1","reg_2","reg_3","reg_4","reg_5")

Reg_coefficients["ipr_130","reg_1"] <- fixef(model_ipr_130)[2]
Reg_coefficients["Black","reg_2"] <- fixef(model_Black)[2]
Reg_coefficients["median_income","reg_3"] <- fixef(model_median_income)[2]
Reg_coefficients["share_virtual","reg_4"] <- fixef(model_mode_of_instruction)["share_virtual"]
Reg_coefficients["share_hybrid","reg_4"] <- fixef(model_mode_of_instruction)["share_hybrid"]

Reg_coefficients["ipr_130","reg_5"] <- fixef(model_all_vars)["ipr_130"]
Reg_coefficients["Black","reg_5"] <- fixef(model_all_vars)["Black"]
Reg_coefficients["median_income","reg_5"] <- fixef(model_all_vars)["median_income"]
Reg_coefficients["share_virtual","reg_5"] <- fixef(model_all_vars)["share_virtual"]
Reg_coefficients["share_hybrid","reg_5"] <- fixef(model_all_vars)["share_hybrid"]


## get table of p-values:
Reg_pvals <- matrix(NA,5,5)
Reg_pvals <- as.data.frame(Reg_pvals)
rownames(Reg_pvals) <- c("ipr_130","Black", "median_income","share_virtual","share_hybrid")
colnames(Reg_pvals) <- c("reg_1","reg_2","reg_3","reg_4","reg_5")

Reg_pvals["ipr_130","reg_1"] <- summary(model_ipr_130)$coefficients[,"Pr(>|t|)"]['ipr_130']
Reg_pvals["Black","reg_2"] <- summary(model_Black)$coefficients[,"Pr(>|t|)"]['Black']
Reg_pvals["median_income","reg_3"] <- summary(model_median_income)$coefficients[,"Pr(>|t|)"]['median_income']
Reg_pvals["share_virtual","reg_4"] <- summary(model_mode_of_instruction)$coefficients[,"Pr(>|t|)"]['share_virtual']
Reg_pvals["share_hybrid","reg_4"] <- summary(model_mode_of_instruction)$coefficients[,"Pr(>|t|)"]['share_hybrid']

Reg_pvals["ipr_130","reg_5"] <- summary(model_all_vars)$coefficients[,"Pr(>|t|)"]['ipr_130']
Reg_pvals["Black","reg_5"] <- summary(model_all_vars)$coefficients[,"Pr(>|t|)"]['Black']
Reg_pvals["median_income","reg_5"] <- summary(model_all_vars)$coefficients[,"Pr(>|t|)"]['median_income']
Reg_pvals["share_virtual","reg_5"] <- summary(model_all_vars)$coefficients[,"Pr(>|t|)"]['share_virtual']
Reg_pvals["share_hybrid","reg_5"] <- summary(model_all_vars)$coefficients[,"Pr(>|t|)"]['share_hybrid']


stat_sig <- Reg_pvals<=.05

## standard errors
Reg_std_errors <- matrix(NA,5,5)
Reg_std_errors <- as.data.frame(Reg_std_errors)
rownames(Reg_std_errors) <- c("ipr_130","Black","median_income","share_virtual","share_hybrid")
colnames(Reg_std_errors) <- c("reg_1","reg_2","reg_3","reg_4","reg_5")

Reg_std_errors["ipr_130","reg_1"] <- summary(model_ipr_130)$coefficients[,"Std. Error"]['ipr_130']
Reg_std_errors["Black","reg_2"] <- summary(model_Black)$coefficients[,"Std. Error"]['Black']
Reg_std_errors["median_income","reg_3"] <- summary(model_median_income)$coefficients[,"Std. Error"]['median_income']
Reg_std_errors["share_virtual","reg_4"] <- summary(model_mode_of_instruction)$coefficients[,"Std. Error"]['share_virtual']
Reg_std_errors["share_hybrid","reg_4"] <- summary(model_mode_of_instruction)$coefficients[,"Std. Error"]['share_hybrid']

Reg_std_errors["ipr_130","reg_5"] <- summary(model_all_vars)$coefficients[,"Std. Error"]['ipr_130']
Reg_std_errors["Black","reg_5"] <- summary(model_all_vars)$coefficients[,"Std. Error"]['Black']
Reg_std_errors["median_income","reg_5"] <- summary(model_all_vars)$coefficients[,"Std. Error"]['median_income']
Reg_std_errors["share_virtual","reg_5"] <- summary(model_all_vars)$coefficients[,"Std. Error"]['share_virtual']
Reg_std_errors["share_hybrid","reg_5"] <- summary(model_all_vars)$coefficients[,"Std. Error"]['share_hybrid']

# b) - table of N, R^2
# lmer gives marginal and conditional R^2. The first represents the variance explained only by the fixed effects. 
# The second represents the variance explained by both the fixed and random effects.
# Make a marginal R^2 row and a conditional R^2 row
Reg_n_r2 <-  matrix(NA,1,5)
rownames(Reg_n_r2) <- c("N")
colnames(Reg_n_r2) <- c("reg_1","reg_2","reg_3","reg_4","reg_5")

Reg_n_r2["N","reg_1"] <- nobs(model_ipr_130)
Reg_n_r2["N","reg_2"] <- nobs(model_Black)
Reg_n_r2["N","reg_3"] <- nobs(model_median_income)
Reg_n_r2["N","reg_4"] <- nobs(model_mode_of_instruction)
Reg_n_r2["N","reg_5"] <- nobs(model_all_vars)

r2 <- t(rbind(r.squaredGLMM(model_ipr_130),r.squaredGLMM(model_Black),r.squaredGLMM(model_median_income),
              r.squaredGLMM(model_mode_of_instruction),r.squaredGLMM(model_all_vars)))
Reg_n_r2 <- rbind(Reg_n_r2,r2)

# 3) Combine
# make a dataframe with asterisks where result is statistically significant:
asterisk <- matrix(NA, nrow(stat_sig), ncol(stat_sig))
asterisk <- as.data.frame(asterisk)
colnames(asterisk) <- colnames(stat_sig)
rownames(asterisk) <- rownames(stat_sig)

for(j in 1:ncol(stat_sig)){
  for(i in 1:nrow(stat_sig)){
    if(is.na(stat_sig[i,j])){asterisk[i,j] <- ""}
    else if(stat_sig[i,j]){asterisk[i,j] <- "*"}
    else{asterisk[i,j] <- ""}
  }
}
asterisk; stat_sig

# scale by 20, round and combine 
Reg_coefficients <- Reg_coefficients/20
Reg_std_errors <- Reg_std_errors/20
Reg_coefficients <- round(Reg_coefficients,3)
Reg_std_errors <- round(Reg_std_errors,3)

parentheses_fcn <- function(vec){
  for(i in 1:length(vec)){
    if(!is.na(vec[i])){vec[i] <- paste("(",vec[i],")", sep="")}
  } 
  return(vec)}
Reg_std_errors <- apply(Reg_std_errors,2,parentheses_fcn)

# Make final table:
Reg_table <- matrix(NA, nrow(Reg_coefficients), ncol(Reg_coefficients))
rownames(Reg_table) <- rownames(Reg_coefficients)
colnames(Reg_table) <- colnames(Reg_coefficients)
Reg_table <- as.data.frame(Reg_table)

Reg_table$reg_1 <- sprintf("%s%s %s", Reg_coefficients[,"reg_1"], asterisk[,"reg_1"], Reg_std_errors[,"reg_1"])
Reg_table$reg_2 <- sprintf("%s%s %s", Reg_coefficients[,"reg_2"], asterisk[,"reg_2"], Reg_std_errors[,"reg_2"])
Reg_table$reg_3 <- sprintf("%s%s %s", Reg_coefficients[,"reg_3"], asterisk[,"reg_3"], Reg_std_errors[,"reg_3"])
Reg_table$reg_4 <- sprintf("%s%s %s", Reg_coefficients[,"reg_4"], asterisk[,"reg_4"], Reg_std_errors[,"reg_4"])
Reg_table$reg_5 <- sprintf("%s%s %s", Reg_coefficients[,"reg_5"], asterisk[,"reg_5"], Reg_std_errors[,"reg_5"])

Reg_table <- rbind(Reg_table, N=round(Reg_n_r2["N",]),format(round(Reg_n_r2[c('R2m','R2c'),],6),scientific = F))

write.csv(Reg_table,"HLM_table.csv")

